This script measure teh detachment score for genes and it has been used to emasure NL association score for ABCB1 in many differnt conditions performe by Anna G. Manjon at B5 nzlae

suppressMessages(suppressWarnings(library(GenomicRanges)))


suppressMessages(suppressWarnings(library(RColorBrewer)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(ggbeeswarm)))
suppressMessages(suppressWarnings(library(limma)))
suppressMessages(suppressWarnings(library(edgeR)))
suppressMessages(suppressWarnings(library(grid)))
library(VennDiagram)
## Loading required package: futile.logger
library(stringr)
library(khroma)
library(dtplyr)
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")
library("rtracklayer")
library("ggpubr")
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:VennDiagram':
## 
##     rotate
library("ggrastr")
#prepare output
output_dir<- "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/SGM20220725_Anna_New_Taxol_Clones"

dir.create(output_dir, showWarnings = FALSE)
# Parameter to extend bins to a certain distance (bases) for a more accurate 
# lamina interaction score
ext<- 1e4
#Get the sample data frame prepared

input_dir<- "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc"
#Use samples from a previous document


samples <-list.files(path = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc", pattern = "Lmnb2")
#samples_Tom <-list.files(path = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/tracks/normalized/bin-20kb", pattern="LaminB")


samples <-as.data.frame(samples)

#samples$sample <- gsub("-10kb", "-gatc", samples$sample)

samples.lmnb2<- samples$sample

n<- nrow(samples)
# Read Chromosome size
chrom_sizes<-read.table("/DATA/scratch/usr/t.v.schaik/data/genomes/GRCh38/hg38.chrom.sizes")
names(chrom_sizes)<- c("seqnames", "length")
row.names(chrom_sizes)<-chrom_sizes$seqnames
#Prepare output files
df.count.file<- file.path(output_dir, "df_count.rds")
genes.file<- file.path(output_dir, "genes.rds")
gene.counts.file<- file.path(output_dir, "genes_counts.rds")
### 1) Count reads in genes

# ts220203 - improve Stefano's script

# New code. Changes:
# - only read data if output .rds file does not exist
# - change to read.table to read_tsv, which should be faster

# 1) Read count data frame

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.5
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.3     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks BiocGenerics::combine()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()
read_sample_and_dam <- function(x, 
                                input_dir = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc") {
  
  # read sample and dam separately
  sample <- read_tsv(file.path(input_dir,
                               x),
                     show_col_types = F,
                     col_names = c("seqnames",
                                   "start",
                                   "end",
                                   x),
                     col_types = c(seqnames = "-",
                                   start = "-",
                                   end = "-"))
  
  dam <- read_tsv(file.path(input_dir,
                            gsub("Lmnb2", "Dam", x)),
                  show_col_types = F,
                  col_names = c("seqnames",
                                "start",
                                "end",
                                paste0(x, "_Dam")),
                  col_types = c(seqnames = "-",
                                start = "-",
                                end = "-"))
  
  # combine
  sample <- bind_cols(sample, dam)
  
  # and return
  sample
  
}

if (! file.exists(df.count.file)) {
  
  # first, load bins
  # add +1 to the start, as this was bed-based
  bins <- read_tsv(file.path(input_dir,
                             samples.lmnb2[1]),
                   col_names = c("seqnames",
                                 "start",
                                 "end")) %>%
    dplyr::select(1:3) %>%
    mutate(start = start + 1)
  
  # load counts
  tib_count <- purrr::map(samples.lmnb2,
                           read_sample_and_dam) %>%
    purrr::reduce(bind_cols)
  
  # combine the two
  tib_count <- bind_cols(bins, tib_count)
  
  # convert to GRanges and set seqlevels
  df.count <- as(tib_count, "GRanges")
  
  # Set seqlevels
  seqlevels(df.count, pruning = "coarse") <- c(paste0("chr", 1:22), "chrX")
  seqlengths(df.count) <- chrom_sizes[seqlevels(df.count), "length"]
  
  saveRDS(df.count, df.count.file)
  
} else {
  df.count <- readRDS(df.count.file)
}
#I need the gene locations to count the data
genes<- import("/DATA/scratch/usr/t.v.schaik/data/gene_builds/GRCh38/gencode.v24.primary_assembly.annotation.gtf")
genes <- genes[genes$type=="gene"]
 
#Filter for protein coding genes
genes<-genes[genes$gene_type == "protein_coding"]
 
#filter for chromosomes
genes <- genes[seqnames(genes) %in% c(paste0("chr", 1:22), "chrX")]
 
#Set seqlevels
seqlevels(genes, pruning = "coarse") <- c(paste0("chr", 1:22),"chrX")
seqlengths(genes)<- chrom_sizes[seqlevels(genes), "length"]
 
#First extend the genes
genes.query <- genes
mcols(genes.query)<- NULL
start(genes.query)<- start(genes.query) - ext
end(genes.query)<- end(genes.query) + ext
 
#Overlap genes with the counts
ovl<-findOverlaps(genes.query, df.count)
 
#Get the number of GATC fragment for every gene query
mcols(genes) [,"GATC_fragments"]<- 0
t <- table(queryHits(ovl))
mcols(genes)[as.integer(names(t)), "GATC_fragments"] <- as.numeric(t)
 
#Get the summed signal for every gene
gene.counts<- genes
mcols(gene.counts) <-NULL
#mcols(gene.counts)[, names(mcols(df.count))]<-NA
 
 
# Faster tibble/dtplyr implementation
tib <- lazy_dt(as.tibble(ovl)) %>% #as_tibble()
  rename_all(~ c("idx_genes", "idx_gatc")) %>%
  as_tibble()
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
tib <- bind_cols(tib,
                 as.tibble(mcols(df.count[tib$idx_gatc]))) #as_tibble()
 
tib_summarise <- tib %>%
  gather(key, value, -contains("idx"))
 
tib_summarise <- lazy_dt(tib_summarise) %>%
  group_by(idx_genes, key) %>%
  summarise(count = sum(value)) %>%
  ungroup() %>%
  mutate(key = factor(key, levels = str_replace_all(names(mcols(df.count)), "-", "."))) %>%
  as.tibble() #as_tibble()
## `summarise()` has grouped output by 'idx_genes'. You can override using the
## `.groups` argument.
tib_genes <- tib_summarise %>%
  spread(key, count)
 
mcols(gene.counts) <- tib_genes %>%
  dplyr::select(-idx_genes)
 
names(mcols(gene.counts)) <- names(mcols(df.count))
saveRDS(genes, genes.file)
saveRDS(gene.counts, gene.counts.file)
df.counts<-readRDS(df.count.file)
genes<- readRDS(genes.file)
gene.counts<- readRDS(gene.counts.file)

2) Normalize gene counts

Next, we would like to normalize the gene counts for various parameters:

Absolute read filtering

# To-do: filter genes for having too few reads?
#        alternatively, filter for number of GATC fragments overlapping a gene?

idx<- rowSums(as(mcols(gene.counts), "data.frame") >=10) >=2
gene.counts <- gene.counts[idx]
genes <- genes[idx]

The library size

library_size <- colSums(as(mcols(df.count), "data.frame"))
library_size
##          pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz 
##                                             9146819 
##      pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            11628140 
##          pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz 
##                                            16015780 
##      pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            15530112 
##         pADamID-iCA3_c7_r1_Lmnb2-gatc.counts.txt.gz 
##                                            23983578 
##     pADamID-iCA3_c7_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            20015832 
##         pADamID-iCA3_c7_r2_Lmnb2-gatc.counts.txt.gz 
##                                            23754126 
##     pADamID-iCA3_c7_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            12392542 
##     pADamID-iCA3_c7_TXR_r1_Lmnb2-gatc.counts.txt.gz 
##                                            16280849 
## pADamID-iCA3_c7_TXR_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            19601103 
##     pADamID-iCA3_c7_TXR_r2_Lmnb2-gatc.counts.txt.gz 
##                                            26154121 
## pADamID-iCA3_c7_TXR_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            26154121 
##           pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz 
##                                            16154243 
##       pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            20956293 
##           pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz 
##                                            17242299 
##       pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            16661470 
##       pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz 
##                                            23549410 
##   pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            19472337 
##       pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz 
##                                            33529924 
##   pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            18239916 
##            pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz 
##                                            13791640 
##        pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            14815531 
##            pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz 
##                                            18837611 
##        pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            18477002 
##            pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz 
##                                            13539117 
##        pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            11677399 
##            pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz 
##                                            15468417 
##        pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            16265507 
##            pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz 
##                                            14819533 
##        pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            13951858 
##            pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz 
##                                            15712352 
##        pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            14335555 
##            pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz 
##                                            14514591 
##        pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            13510895 
##            pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz 
##                                            16824129 
##        pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            13196807 
##            pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz 
##                                            19363471 
##        pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            16382760 
##            pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz 
##                                            17710880 
##        pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz_Dam 
##                                            14072587
# Also combined the counts (Dam + lamina) and have them separate
gene.counts.combined<- gene.counts.lamina <- gene.counts.dam<- gene.counts
mcols(gene.counts.combined) <- (as(mcols(gene.counts)[, seq(1, 2*n, 2)], "data.frame")+
                                  as(mcols(gene.counts)[, seq(2, 2*n, 2)], "data.frame"))
mcols(gene.counts.lamina) <-mcols(gene.counts)[,seq(1, 2*n,2)]
mcols(gene.counts.dam) <-mcols(gene.counts)[,seq(2, 2*n,2)]

#Normalize to 1 M reads
norm_factor <- library_size/ 1e6
gene.counts.norm<- gene.counts
mcols(gene.counts.norm)<- t(t(as(mcols(gene.counts), "data.frame"))/ norm_factor)

#Also create a 1M mean counts of Dam and Lamina per experiment
gene.counts.norm.combined<- gene.counts.norm
mcols(gene.counts.norm.combined) <- (as(mcols(gene.counts.norm) [,seq(1, 2*n, 2)], "data.frame")+
                                      as(mcols(gene.counts.norm) [,seq(2, 2*n, 2)], "data.frame")) /2
# Normalize Lamina over Dam function (ratio2)
NormalizeDamID <- function(df, pseudo = 1) {
  # This function expects a data frame that is composed of lamina and Dam-only
  # columns, in that order. It simply determines the log2 ratio of the two for
  # every pair of columns. A pseudocount is added to prevent problems.
  n <- ncol(df)
  df.norm <- log2((df[, seq(1, n-1, 2)] + pseudo) / (df[, seq(2, n, 2)] + pseudo))
  df.norm
}

#Normalize LB over Dam (ratio2)
genes.norm <- gene.counts
mcols(genes.norm) <- NormalizeDamID(as(mcols(gene.counts.norm), "data.frame"))
mcols(genes.norm) <- mcols(genes.norm)[, samples$sample]

# Plot the distributions for the various samples This does not work
#plot(1, 1, type = "n", xlim = c(-6, 4), ylim = c(0, 0.6),
 #    xlab = "Dam-ratio (log2)", ylab = "density", main = "Density Dam ratio for genes")

#for (i in 1:n) {
 # lines(density(mcols(genes.norm)[, i],na.rm = TRUE,), col = samples, lwd = 2,
       # lty = as.numeric(samples.df$integration[i]),)
#}

#legend("topright", legend = levels(samples), 
     #  pch = 19, col = 1:length(levels(samples)))
#Scale the data to the same mean and stdev
genes.norm.scaled<-genes.norm
mcols(genes.norm.scaled)<- scale(as(mcols(genes.norm.scaled), "data.frame"))
# Save files as RDS
saveRDS(samples, file.path(output_dir, "samples.rds"))
saveRDS(genes.norm.scaled, file.path(output_dir, "genes_norm_filtered.rds"))

# First, change the mcols of the genes
mcols(genes) <- mcols(genes)[, c("gene_id", "gene_name", "GATC_fragments")]
saveRDS(genes, file.path(output_dir, "genes_filtered.rds"))
# Samples data frame - basically metadata
padamid.samples<- readRDS(file.path(output_dir,
                                    "samples.rds"))

# Filtered genes used in the pA-DamID analysis
padamid.genes <- readRDS(file.path(output_dir, 
                                   "genes_filtered.rds"))

# Data frame with per-gene information on differential analysis
#padamid.results <- readRDS(file.path(output_dir,
                                    #  "Differential_score.rds"))
padamid.norm <- readRDS(file.path(output_dir,
                                  "genes_norm_filtered.rds"))

names(mcols(padamid.norm))<-padamid.samples$samples

class(padamid.norm)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#class(padamid.results)
class(padamid.genes)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
class(padamid.samples)
## [1] "data.frame"
padamid_norm_df<- as(mcols(padamid.norm), "data.frame") 
padamid_genes_df<- as(mcols(padamid.genes), "data.frame")
#binding gene name and danID score
damid_score_gene<-cbind(padamid_genes_df, padamid_norm_df)


Taxol_damid_score_gene<-damid_score_gene %>%mutate(DPURO_3=(`pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(TXR3=(`pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(TXR4=(`pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(RPE0=(`pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(TXR5=(`pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(TXR6=(`pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(ANCHOR_P=(`pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            mutate(ANCHOR_P_TXR=(`pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
                                            dplyr::rename(ANCHOR_c7_r1=`pADamID-iCA3_c7_r1_Lmnb2-gatc.counts.txt.gz`)%>%
                                            dplyr::rename(ANCHOR_c7_TXR_r1=`pADamID-iCA3_c7_TXR_r1_Lmnb2-gatc.counts.txt.gz`)

ABCB1<-Taxol_damid_score_gene%>%filter(gene_name=="ABCB1")
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR3.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= DPURO_3, y= TXR3))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone3,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= DPURO_3, y= TXR3), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR4.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= DPURO_3, y= TXR4))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone4,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= DPURO_3, y= TXR4), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR5.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= RPE0, y= TXR5))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone5,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE0, y= TXR5), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR6.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= RPE0, y= TXR6))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone6,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE0, y= TXR6), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_Anchor3_parental.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= ANCHOR_P, y= ANCHOR_P_TXR))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Anchor_Parental,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= ANCHOR_P, y= ANCHOR_P_TXR), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_Anchor3_Clone7.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= ANCHOR_c7_r1, y= ANCHOR_c7_TXR_r1))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Anchor_clone 7,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= ANCHOR_c7_r1, y= ANCHOR_c7_TXR_r1), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

Now I want to use old data from Anna and Tom to have an overview of all TXR clones

#prepare output
output_dir<- "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/SGM20220725_Anna_New_Taxol_Clones"

dir.create(output_dir, showWarnings = FALSE)
# Parameter to extend bins to a certain distance (bases) for a more accurate 
# lamina interaction score
ext<- 1e4
#Get the ample data frame prepared

input_dir<- "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc"
#Use samples from a previous document


#samples <-list.files(path = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc", pattern = "Lmnb2")
samples_Tom <-list.files(path = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc", pattern="LaminB2")


samples_Tom <-as.data.frame(samples_Tom)

#samples_Tom$sample <- gsub("-10kb", "-gatc", samples_Tom$sample)

samples_Tom.lmnb2<- samples_Tom$sample

n<- nrow(samples_Tom)
# Read Chromosome size
chrom_sizes<-read.table("/DATA/scratch/usr/t.v.schaik/data/genomes/GRCh38/hg38.chrom.sizes")
names(chrom_sizes)<- c("seqnames", "length")
row.names(chrom_sizes)<-chrom_sizes$seqnames
#Prepare output files
df.count_Tom.file<- file.path(output_dir, "df_count_Tom.rds")
genes_Tom.file<- file.path(output_dir, "genes_Tom.rds")
gene_Tom.counts.file<- file.path(output_dir, "genes_Tom_counts.rds")
### 1) Count reads in genes_Tom

# ts220203 - improve Stefano's script

# New code. Changes:
# - only read data if output .rds file does not exist
# - change to read.table to read_tsv, which should be faster

# 1) Read count data frame

library(tidyverse)

read_sample_and_dam <- function(x, 
                                input_dir = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc") {
  
  # read sample and dam separately
  sample <- read_tsv(file.path(input_dir,
                               x),
                     show_col_types = F,
                     col_names = c("seqnames",
                                   "start",
                                   "end",
                                   x),
                     col_types = c(seqnames = "-",
                                   start = "-",
                                   end = "-"))
  
  dam <- read_tsv(file.path(input_dir,
                            gsub("LaminB2", "Dam", x)),
                  show_col_types = F,
                  col_names = c("seqnames",
                                "start",
                                "end",
                                paste0(x, "_Dam")),
                  col_types = c(seqnames = "-",
                                start = "-",
                                end = "-"))
  
  # combine
  sample <- bind_cols(sample, dam)
  
  # and return
  sample
  
}

if (! file.exists(df.count_Tom.file)) {
  
  # first, load bins
  # add +1 to the start, as this was bed-based
  bins <- read_tsv(file.path(input_dir,
                             samples_Tom.lmnb2[1]),
                   col_names = c("seqnames",
                                 "start",
                                 "end")) %>%
    dplyr::select(1:3) %>%
    mutate(start = start + 1)
  
  # load counts
  tib_count <- purrr::map(samples_Tom.lmnb2,
                           read_sample_and_dam) %>%
    purrr::reduce(bind_cols)
  
  # combine the two
  tib_count <- bind_cols(bins, tib_count)
  
  # convert to GRanges and set seqlevels
  df.count_Tom <- as(tib_count, "GRanges")
  
  # Set seqlevels
  seqlevels(df.count_Tom, pruning = "coarse") <- c(paste0("chr", 1:22), "chrX")
  seqlengths(df.count_Tom) <- chrom_sizes[seqlevels(df.count_Tom), "length"]
  
  saveRDS(df.count_Tom, df.count_Tom.file)
  
} else {
  df.count_Tom <- readRDS(df.count_Tom.file)
}
    /DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-20kb/A549_61_LaminB2_R1-20kb.counts.txt.gz
#I need the gene locations to count the data
genes_Tom<- import("/DATA/scratch/usr/t.v.schaik/data/gene_builds/GRCh38/gencode.v24.primary_assembly.annotation.gtf")
genes_Tom <- genes_Tom[genes_Tom$type=="gene"]
 
#Filter for protein coding genes_Tom
genes_Tom<-genes_Tom[genes_Tom$gene_type == "protein_coding"]
 
#filter for chromosomes
genes_Tom <- genes_Tom[seqnames(genes_Tom) %in% c(paste0("chr", 1:22), "chrX")]
 
#Set seqlevels
seqlevels(genes_Tom, pruning = "coarse") <- c(paste0("chr", 1:22),"chrX")
seqlengths(genes_Tom)<- chrom_sizes[seqlevels(genes_Tom), "length"]
 
#First extend the genes_Tom
genes_Tom.query <- genes_Tom
mcols(genes_Tom.query)<- NULL
start(genes_Tom.query)<- start(genes_Tom.query) - ext
end(genes_Tom.query)<- end(genes_Tom.query) + ext
 
#Overlap genes_Tom with the counts
ovl<-findOverlaps(genes_Tom.query, df.count_Tom)
 
#Get the number of GATC fragment for every gene query
mcols(genes_Tom) [,"GATC_fragments"]<- 0
t <- table(queryHits(ovl))
mcols(genes_Tom)[as.integer(names(t)), "GATC_fragments"] <- as.numeric(t)
 
#Get the summed signal for every gene
gene_Tom.counts<- genes_Tom
mcols(gene_Tom.counts) <-NULL
#mcols(gene.counts)[, names(mcols(df.count_Tom))]<-NA
 
 
# Faster tibble/dtplyr implementation
tib <- lazy_dt(as.tibble(ovl)) %>% #as_tibble()
  rename_all(~ c("idx_genes_Tom", "idx_gatc")) %>%
  as_tibble()
 
tib <- bind_cols(tib,
                 as.tibble(mcols(df.count_Tom[tib$idx_gatc]))) #as_tibble()
 
tib_summarise <- tib %>%
  gather(key, value, -contains("idx"))
 
tib_summarise <- lazy_dt(tib_summarise) %>%
  group_by(idx_genes_Tom, key) %>%
  summarise(count = sum(value)) %>%
  ungroup() %>%
  mutate(key = factor(key, levels = str_replace_all(names(mcols(df.count_Tom)), "-", "."))) %>%
  as.tibble() #as_tibble()
## `summarise()` has grouped output by 'idx_genes_Tom'. You can override using the
## `.groups` argument.
tib_genes_Tom <- tib_summarise %>%
  spread(key, count)
 
mcols(gene_Tom.counts) <- tib_genes_Tom %>%
  dplyr::select(-idx_genes_Tom)
 
names(mcols(gene_Tom.counts)) <- names(mcols(df.count_Tom))
saveRDS(genes_Tom, genes_Tom.file)
saveRDS(gene_Tom.counts, gene_Tom.counts.file)

df.count_Tom.file<- file.path(output_dir, “df_count_Tom.rds”) genes_Tom.file<- file.path(output_dir, “genes_Tom.rds”) gene_Tom.counts.file<- file.path(output_dir, “genes_Tom_counts.rds”)

df.counts_Tom<-readRDS(df.count_Tom.file)
genes_Tom<- readRDS(genes_Tom.file)
genes_Tom.count<- readRDS(gene_Tom.counts.file)

2) Normalize gene counts

Next, we would like to normalize the gene counts for various parameters:

Absolute read filtering

# To-do: filter genes_Tom for having too few reads?
#        alternatively, filter for number of GATC fragments overlapping a gene?

idx<- rowSums(as(mcols(genes_Tom.count), "data.frame") >=10) >=2
genes_Tom.count <- genes_Tom.count[idx]
genes_Tom <- genes_Tom[idx]

The library size

library_size <- colSums(as(mcols(df.count_Tom), "data.frame"))
library_size
##                             A549_61_LaminB2_R1-gatc.counts.txt.gz 
##                                                           7128669 
##                         A549_61_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5070144 
##                             A549_61_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5287346 
##                         A549_61_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           4105635 
##                             A549_71_LaminB2_R1-gatc.counts.txt.gz 
##                                                           3413463 
##                         A549_71_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           3808615 
##                             A549_71_LaminB2_R2-gatc.counts.txt.gz 
##                                                           9883893 
##                         A549_71_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           4371685 
##                   RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz 
##                                                           6648108 
##               RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                                           6648108 
##                   RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz 
##                                                           6901312 
##               RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                                           6901312 
##                   RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz 
##                                                           7213865 
##               RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                                           7213865 
##                   RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz 
##                                                           7230495 
##               RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                                           7230495 
##                   RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz 
##                                                           7109704 
##               RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                                           7109704 
##                   RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz 
##                                                          10342949 
##               RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                                          10342949 
##                 RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz 
##                                                           6974508 
##             RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                                           6974508 
##                 RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz 
##                                                           7428567 
##             RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                                           7428567 
##                      RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz 
##                                                           5722620 
##                  RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                                           5722620 
##                      RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz 
##                                                           6491458 
##                  RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                                           6491458 
##                       RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4846081 
##                   RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5020601 
##                       RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz 
##                                                           4753605 
##                   RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           5696142 
##                       RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz 
##                                                           5017152 
##                   RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           6023804 
##                       RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz 
##                                                           4339456 
##                   RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           5869564 
##                         RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz 
##                                                           6199078 
##                     RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz_Dam 
##                                                           6199078 
##                         RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz 
##                                                           6490134 
##                     RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz_Dam 
##                                                           6490134 
##                         RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz 
##                                                           6208211 
##                     RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           6837316 
##                         RPE_dPC3_WT_LaminB2_R2-gatc.counts.txt.gz 
##                                                           4465616 
##                     RPE_dPC3_WT_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6696393 
##                  RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz 
##                                                           6522273 
##              RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                                           6522273 
##                  RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz 
##                                                           6066689 
##              RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                                           6066689 
##                  RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz 
##                                                           5371421 
##              RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4856657 
##                  RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5869513 
##              RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6443064 
##              RPE_iCut_ABCB1_cr6_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           2664729 
##          RPE_iCut_ABCB1_cr6_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4582596 
##              RPE_iCut_ABCB1_cr6_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           9355652 
##          RPE_iCut_ABCB1_cr6_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                          12557773 
##              RPE_iCut_ABCB1_cr6_10h_LaminB2_R3-gatc.counts.txt.gz 
##                                                           7421436 
##          RPE_iCut_ABCB1_cr6_10h_LaminB2_R3-gatc.counts.txt.gz_Dam 
##                                                           8532793 
##              RPE_iCut_ABCB1_cr6_10h_LaminB2_R4-gatc.counts.txt.gz 
##                                                           8817767 
##          RPE_iCut_ABCB1_cr6_10h_LaminB2_R4-gatc.counts.txt.gz_Dam 
##                                                           5515521 
##              RPE_iCut_ABCB1_cr6_24h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           2158312 
##          RPE_iCut_ABCB1_cr6_24h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5919371 
##              RPE_iCut_ABCB1_cr6_24h_LaminB2_R2-gatc.counts.txt.gz 
##                                                          12254511 
##          RPE_iCut_ABCB1_cr6_24h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                          12260142 
##         RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4493792 
##     RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           8370230 
##         RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5340283 
##     RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           7517688 
##         RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4451419 
##     RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5817951 
##         RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           4072549 
##     RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           8075793 
##        RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           2309130 
##    RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4043287 
##        RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           2605713 
##    RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           3556261 
##          RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4572423 
##      RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           7281427 
##          RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           2893637 
##      RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           3978089 
##        RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4430998 
##    RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           8999909 
##        RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           2806431 
##    RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           4361892 
##     RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4298643 
## RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4303820 
##     RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           4322053 
## RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           3647708 
##        RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           2779896 
##    RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           2945057 
##        RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           8585892 
##    RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           7505476 
##             RPE_iCut_ADAM22_cr3_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           2242930 
##         RPE_iCut_ADAM22_cr3_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4981281 
##             RPE_iCut_ADAM22_cr3_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           9532467 
##         RPE_iCut_ADAM22_cr3_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           8548987 
##             RPE_iCut_ADAM22_cr3_24h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           1901805 
##         RPE_iCut_ADAM22_cr3_24h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4098346 
##             RPE_iCut_ADAM22_cr3_24h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           8849699 
##         RPE_iCut_ADAM22_cr3_24h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           4894243 
##        RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4191516 
##    RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           7060830 
##        RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5958699 
##    RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           7537442 
##         RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R1-gatc.counts.txt.gz 
##                                                           5246872 
##     RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           7984126 
##         RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R2-gatc.counts.txt.gz 
##                                                           6170806 
##     RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           7825084 
##         RPE_iCut_ANCHOR3_polyclonal_LaminB2_R1-gatc.counts.txt.gz 
##                                                           7364115 
##     RPE_iCut_ANCHOR3_polyclonal_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5998033 
##         RPE_iCut_ANCHOR3_polyclonal_LaminB2_R2-gatc.counts.txt.gz 
##                                                           8677098 
##     RPE_iCut_ANCHOR3_polyclonal_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           4743951 
##                      RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz 
##                                                           7001508 
##                  RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           8885964 
##                      RPE_iCut_BIX_2_LaminB2_R2-gatc.counts.txt.gz 
##                                                           4889914 
##                  RPE_iCut_BIX_2_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6486412 
##                      RPE_iCut_BIX_2_LaminB2_R3-gatc.counts.txt.gz 
##                                                           8118928 
##                  RPE_iCut_BIX_2_LaminB2_R3-gatc.counts.txt.gz_Dam 
##                                                           8106445 
##                       RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz 
##                                                           9016169 
##                   RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           6816124 
##                       RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz 
##                                                           6004829 
##                   RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6600703 
##                 RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz 
##                                                           6571922 
##             RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4560595 
##                 RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz 
##                                                          15686330 
##             RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6834085 
##                      RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz 
##                                                           5837763 
##                  RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz_Dam 
##                                                           5837763 
##                      RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz 
##                                                           6340729 
##                  RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz_Dam 
##                                                           6340729 
##                      RPE_iCut_LBRKO_LaminB2_R1-gatc.counts.txt.gz 
##                                                           6228819 
##                  RPE_iCut_LBRKO_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                          13205669 
##                      RPE_iCut_LBRKO_LaminB2_R2-gatc.counts.txt.gz 
##                                                           7378437 
##                  RPE_iCut_LBRKO_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           8854090 
##                     RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz 
##                                                          10624186 
##                 RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5262561 
##                     RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5842671 
##                 RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           5978557 
##                    RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz 
##                                                           5481121 
##                RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           6539260 
##                    RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5141371 
##                RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           5367725 
##             RPE_iCut_pool9_10h_ATMi_LaminB2_R1-gatc.counts.txt.gz 
##                                                           6852087 
##         RPE_iCut_pool9_10h_ATMi_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           8035263 
##             RPE_iCut_pool9_10h_ATMi_LaminB2_R2-gatc.counts.txt.gz 
##                                                           6365723 
##         RPE_iCut_pool9_10h_ATMi_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6804968 
##           RPE_iCut_pool9_10h_DNAPKi_LaminB2_R1-gatc.counts.txt.gz 
##                                                           5011125 
##       RPE_iCut_pool9_10h_DNAPKi_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           8622881 
##           RPE_iCut_pool9_10h_DNAPKi_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5774369 
##       RPE_iCut_pool9_10h_DNAPKi_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6553886 
##                  RPE_iCut_pool9_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           4549861 
##              RPE_iCut_pool9_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5333752 
##                  RPE_iCut_pool9_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           6666603 
##              RPE_iCut_pool9_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6489322 
##                  RPE_iCut_pool9_10h_LaminB2_R3-gatc.counts.txt.gz 
##                                                           5611353 
##              RPE_iCut_pool9_10h_LaminB2_R3-gatc.counts.txt.gz_Dam 
##                                                           5893908 
##                  RPE_iCut_pool9_10h_LaminB2_R4-gatc.counts.txt.gz 
##                                                           6800960 
##              RPE_iCut_pool9_10h_LaminB2_R4-gatc.counts.txt.gz_Dam 
##                                                           8063952 
##                  RPE_iCut_pool9_24h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           5463941 
##              RPE_iCut_pool9_24h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           7206086 
##                  RPE_iCut_pool9_24h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5487181 
##              RPE_iCut_pool9_24h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           7821692 
##             RPE_iCut_pool9tracr_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           7021423 
##         RPE_iCut_pool9tracr_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                          11729929 
##             RPE_iCut_pool9tracr_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5427104 
##         RPE_iCut_pool9tracr_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                          11232355 
##                  RPE_iCut_tracr_10h_LaminB2_R1-gatc.counts.txt.gz 
##                                                           2441927 
##              RPE_iCut_tracr_10h_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           5098431 
##                  RPE_iCut_tracr_10h_LaminB2_R2-gatc.counts.txt.gz 
##                                                          10822728 
##              RPE_iCut_tracr_10h_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           9271605 
##                  RPE_iCut_tracr_10h_LaminB2_R3-gatc.counts.txt.gz 
##                                                           8012049 
##              RPE_iCut_tracr_10h_LaminB2_R3-gatc.counts.txt.gz_Dam 
##                                                           5815176 
##                  RPE_iCut_tracr_10h_LaminB2_R4-gatc.counts.txt.gz 
##                                                           8760328 
##              RPE_iCut_tracr_10h_LaminB2_R4-gatc.counts.txt.gz_Dam 
##                                                           6369574 
##              RPE_iCut_TxR_cloneF6_1_LaminB2_R1-gatc.counts.txt.gz 
##                                                           8056821 
##          RPE_iCut_TxR_cloneF6_1_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           4855944 
##              RPE_iCut_TxR_cloneF6_1_LaminB2_R2-gatc.counts.txt.gz 
##                                                           5153990 
##          RPE_iCut_TxR_cloneF6_1_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           7015188 
##              RPE_iCut_TxR_cloneG6_3_LaminB2_R1-gatc.counts.txt.gz 
##                                                           7532241 
##          RPE_iCut_TxR_cloneG6_3_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           6366250 
##              RPE_iCut_TxR_cloneG6_3_LaminB2_R2-gatc.counts.txt.gz 
##                                                           7788911 
##          RPE_iCut_TxR_cloneG6_3_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6153409 
##                         RPE_iCut_WT_LaminB2_R1-gatc.counts.txt.gz 
##                                                           6522158 
##                     RPE_iCut_WT_LaminB2_R1-gatc.counts.txt.gz_Dam 
##                                                           8772753 
##                         RPE_iCut_WT_LaminB2_R2-gatc.counts.txt.gz 
##                                                           3125946 
##                     RPE_iCut_WT_LaminB2_R2-gatc.counts.txt.gz_Dam 
##                                                           6382989 
##                         RPE_iCut_WT_LaminB2_R3-gatc.counts.txt.gz 
##                                                           4405577 
##                     RPE_iCut_WT_LaminB2_R3-gatc.counts.txt.gz_Dam 
##                                                           6483707 
##                         RPE_iCut_WT_LaminB2_R4-gatc.counts.txt.gz 
##                                                           7454359 
##                     RPE_iCut_WT_LaminB2_R4-gatc.counts.txt.gz_Dam 
##                                                           8740536
# Also combined the counts (Dam + lamina) and have them separate
genes_Tom.count.combined<- genes_Tom.count.lamina <- genes_Tom.count.dam<- genes_Tom.count
mcols(genes_Tom.count.combined) <- (as(mcols(genes_Tom.count)[, seq(1, 2*n, 2)], "data.frame")+
                                  as(mcols(genes_Tom.count)[, seq(2, 2*n, 2)], "data.frame"))
mcols(genes_Tom.count.lamina) <-mcols(genes_Tom.count)[,seq(1, 2*n,2)]
mcols(genes_Tom.count.dam) <-mcols(genes_Tom.count)[,seq(2, 2*n,2)]

#Normalize to 1 M reads
norm_factor <- library_size/ 1e6
genes_Tom.count.norm<- genes_Tom.count
mcols(genes_Tom.count.norm)<- t(t(as(mcols(genes_Tom.count), "data.frame"))/ norm_factor)

#Also create a 1M mean counts of Dam and Lamina per experiment
genes_Tom.count.norm.combined<- genes_Tom.count.norm
mcols(genes_Tom.count.norm.combined) <- (as(mcols(genes_Tom.count.norm) [,seq(1, 2*n, 2)], "data.frame")+
                                      as(mcols(genes_Tom.count.norm) [,seq(2, 2*n, 2)], "data.frame")) /2
# Normalize Lamina over Dam function (ratio2)
NormalizeDamID <- function(df, pseudo = 1) {
  # This function expects a data frame that is composed of lamina and Dam-only
  # columns, in that order. It simply determines the log2 ratio of the two for
  # every pair of columns. A pseudocount is added to prevent problems.
  n <- ncol(df)
  df.norm <- log2((df[, seq(1, n-1, 2)] + pseudo) / (df[, seq(2, n, 2)] + pseudo))
  df.norm
}

#Normalize LB over Dam (ratio2)
genes_Tom.norm <- genes_Tom.count
mcols(genes_Tom.norm) <- NormalizeDamID(as(mcols(genes_Tom.count.norm), "data.frame"))
mcols(genes_Tom.norm) <- mcols(genes_Tom.norm)[, samples_Tom$sample]
# Plot the distributions for the various samples_Tom This does not work
#plot(1, 1, type = "n", xlim = c(-6, 4), ylim = c(0, 0.6),
  #   xlab = "Dam-ratio (log2)", ylab = "density", main = "Density Dam ratio for genes_Tom")

#for (i in 1:n) {
#  lines(density(mcols(genes_Tom.norm)[, i],na.rm = TRUE,), col = samples_Tom, lwd = 2,
 #       lty = as.numeric(samples_Tom.df$integration[i]),)
#}

#legend("topright", legend = levels(samples_Tom), 
 #      pch = 19, col = 1:length(levels(samples_Tom)))
#Scale the data to the same mean and stdev
genes_Tom.norm.scaled<-genes_Tom.norm
mcols(genes_Tom.norm.scaled)<- scale(as(mcols(genes_Tom.norm.scaled), "data.frame"))
# Save files as RDS
saveRDS(samples_Tom, file.path(output_dir, "samples_Tom.rds"))
saveRDS(genes_Tom.norm.scaled, file.path(output_dir, "genes_Tom_norm_filtered.rds"))

# First, change the mcols of the genes_Tom
mcols(genes_Tom) <- mcols(genes_Tom)[, c("gene_id", "gene_name", "GATC_fragments")]
saveRDS(genes_Tom, file.path(output_dir, "genes_Tom_filtered.rds"))
# samples_Tom data frame - basically metadata
padamid.samples_Tom<- readRDS(file.path(output_dir,
                                    "samples_Tom.rds"))

# Filtered genes_Tom used in the pA-DamID analysis
padamid.genes_Tom <- readRDS(file.path(output_dir, 
                                   "genes_Tom_filtered.rds"))

# Data frame with per-gene information on differential analysis
#padamid.results <- readRDS(file.path(output_dir,
                                    #  "Differential_score.rds"))
padamid.norm <- readRDS(file.path(output_dir,
                                  "genes_Tom_norm_filtered.rds"))

names(mcols(padamid.norm))<-padamid.samples_Tom$samples_Tom

class(padamid.norm)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#class(padamid.results)
class(padamid.genes_Tom)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
class(padamid.samples_Tom)
## [1] "data.frame"
padamid_norm_df<- as(mcols(padamid.norm), "data.frame") 
padamid_genes_Tom_df<- as(mcols(padamid.genes_Tom), "data.frame")
#binding gene name and danID score
damid_score_gene_Tom<-cbind(padamid_genes_Tom_df, padamid_norm_df)

Now let’s do the same for LMNB1 antibody.

#Use samples from a previous document


#samples <-list.files(path = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc", pattern = "Lmnb2")
samples_Tom2 <-list.files(path = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc", pattern="LaminB1")


samples_Tom2 <-as.data.frame(samples_Tom2)

#samples_Tom$sample <- gsub("-10kb", "-gatc", samples_Tom$sample)

samples_Tom2.lmnb1<- samples_Tom2$sample

n<- nrow(samples_Tom2)
#Prepare output files
df.count_Tom2.file<- file.path(output_dir, "df_count_Tom2.rds")
genes_Tom2.file<- file.path(output_dir, "genes_Tom2.rds")
gene_Tom2.counts.file<- file.path(output_dir, "genes_Tom2_counts.rds")
### 1) Count reads in genes_Tom2

# ts220203 - improve Stefano's script

# New code. Changes:
# - only read data if output .rds file does not exist
# - change to read.table to read_tsv, which should be faster

# 1) Read count data frame

library(tidyverse)

read_sample_and_dam <- function(x, 
                                input_dir = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc") {
  
  # read sample and dam separately
  sample <- read_tsv(file.path(input_dir,
                               x),
                     show_col_types = F,
                     col_names = c("seqnames",
                                   "start",
                                   "end",
                                   x),
                     col_types = c(seqnames = "-",
                                   start = "-",
                                   end = "-"))
  
  dam <- read_tsv(file.path(input_dir,
                            gsub("LaminB1", "Dam", x)),
                  show_col_types = F,
                  col_names = c("seqnames",
                                "start",
                                "end",
                                paste0(x, "_Dam")),
                  col_types = c(seqnames = "-",
                                start = "-",
                                end = "-"))
  
  # combine
  sample <- bind_cols(sample, dam)
  
  # and return
  sample
  
}

if (! file.exists(df.count_Tom2.file)) {
  
  # first, load bins
  # add +1 to the start, as this was bed-based
  bins <- read_tsv(file.path(input_dir,
                             samples_Tom2.lmnb1[1]),
                   col_names = c("seqnames",
                                 "start",
                                 "end")) %>%
    dplyr::select(1:3) %>%
    mutate(start = start + 1)
  
  # load counts
  tib_count <- purrr::map(samples_Tom2.lmnb1,
                           read_sample_and_dam) %>%
    purrr::reduce(bind_cols)
  
  # combine the two
  tib_count <- bind_cols(bins, tib_count)
  
  # convert to GRanges and set seqlevels
  df.count_Tom2 <- as(tib_count, "GRanges")
  
  # Set seqlevels
  seqlevels(df.count_Tom2, pruning = "coarse") <- c(paste0("chr", 1:22), "chrX")
  seqlengths(df.count_Tom2) <- chrom_sizes[seqlevels(df.count_Tom2), "length"]
  
  saveRDS(df.count_Tom2, df.count_Tom2.file)
  
} else {
  df.count_Tom2 <- readRDS(df.count_Tom2.file)
}
    /DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-20kb/A549_61_LaminB2_R1-20kb.counts.txt.gz
#I need the gene locations to count the data
genes_Tom2<- import("/DATA/scratch/usr/t.v.schaik/data/gene_builds/GRCh38/gencode.v24.primary_assembly.annotation.gtf")
genes_Tom2 <- genes_Tom2[genes_Tom2$type=="gene"]
 
#Filter for protein coding genes_Tom2
genes_Tom2<-genes_Tom2[genes_Tom2$gene_type == "protein_coding"]
 
#filter for chromosomes
genes_Tom2 <- genes_Tom2[seqnames(genes_Tom2) %in% c(paste0("chr", 1:22), "chrX")]
 
#Set seqlevels
seqlevels(genes_Tom2, pruning = "coarse") <- c(paste0("chr", 1:22),"chrX")
seqlengths(genes_Tom2)<- chrom_sizes[seqlevels(genes_Tom2), "length"]
 
#First extend the genes_Tom2
genes_Tom2.query <- genes_Tom2
mcols(genes_Tom2.query)<- NULL
start(genes_Tom2.query)<- start(genes_Tom2.query) - ext
end(genes_Tom2.query)<- end(genes_Tom2.query) + ext
 
#Overlap genes_Tom2 with the counts
ovl<-findOverlaps(genes_Tom2.query, df.count_Tom2)
 
#Get the number of GATC fragment for every gene query
mcols(genes_Tom2) [,"GATC_fragments"]<- 0
t <- table(queryHits(ovl))
mcols(genes_Tom2)[as.integer(names(t)), "GATC_fragments"] <- as.numeric(t)
 
#Get the summed signal for every gene
gene_Tom2.counts<- genes_Tom2
mcols(gene_Tom2.counts) <-NULL
#mcols(gene.counts)[, names(mcols(df.count_Tom2))]<-NA
 
 
# Faster tibble/dtplyr implementation
tib <- lazy_dt(as.tibble(ovl)) %>% #as_tibble()
  rename_all(~ c("idx_genes_Tom2", "idx_gatc")) %>%
  as_tibble()
 
tib <- bind_cols(tib,
                 as.tibble(mcols(df.count_Tom2[tib$idx_gatc]))) #as_tibble()
 
tib_summarise <- tib %>%
  gather(key, value, -contains("idx"))
 
tib_summarise <- lazy_dt(tib_summarise) %>%
  group_by(idx_genes_Tom2, key) %>%
  summarise(count = sum(value)) %>%
  ungroup() %>%
  mutate(key = factor(key, levels = str_replace_all(names(mcols(df.count_Tom2)), "-", "."))) %>%
  as.tibble() #as_tibble()
## `summarise()` has grouped output by 'idx_genes_Tom2'. You can override using the
## `.groups` argument.
tib_genes_Tom2 <- tib_summarise %>%
  spread(key, count)
 
mcols(gene_Tom2.counts) <- tib_genes_Tom2 %>%
  dplyr::select(-idx_genes_Tom2)
 
names(mcols(gene_Tom2.counts)) <- names(mcols(df.count_Tom2))
saveRDS(genes_Tom2, genes_Tom2.file)
saveRDS(gene_Tom2.counts, gene_Tom2.counts.file)
df.count_Tom2.file<- file.path(output_dir, "df_count_Tom2.rds")
genes_Tom2.file<- file.path(output_dir, "genes_Tom2.rds")
gene_Tom2.counts.file<- file.path(output_dir, "genes_Tom2_counts.rds")
df.counts_Tom2<-readRDS(df.count_Tom2.file)
genes_Tom2<- readRDS(genes_Tom2.file)
genes_Tom2.count<- readRDS(gene_Tom2.counts.file)

2) Normalize gene counts

Next, we would like to normalize the gene counts for various parameters:

Absolute read filtering

# To-do: filter genes_Tom2 for having too few reads?
#        alternatively, filter for number of GATC fragments overlapping a gene?

idx<- rowSums(as(mcols(genes_Tom2.count), "data.frame") >=10) >=2
genes_Tom2.count <- genes_Tom2.count[idx]
genes_Tom2 <- genes_Tom2[idx]

The library size

library_size <- colSums(as(mcols(df.counts_Tom2), "data.frame"))
library_size
##       RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz 
##                                               6648108 
##   RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                               7122095 
##       RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz 
##                                               6901312 
##   RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                               5807849 
##       RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz 
##                                               7213865 
##   RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                               8055325 
##       RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz 
##                                               7230495 
##   RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                               7386565 
##       RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz 
##                                               7109704 
##   RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                               7596199 
##       RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz 
##                                              10342949 
##   RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                               7489433 
##     RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz 
##                                               6974508 
## RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                               7375382 
##     RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz 
##                                               7428567 
## RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                               7643385 
##          RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz 
##                                               5722620 
##      RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                               5329915 
##          RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz 
##                                               6491458 
##      RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                               5724573 
##             RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz 
##                                               6199078 
##         RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz_Dam 
##                                               5864288 
##             RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz 
##                                               6490134 
##         RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz_Dam 
##                                               6358818 
##      RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz 
##                                               6522273 
##  RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz_Dam 
##                                               7008463 
##      RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz 
##                                               6066689 
##  RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz_Dam 
##                                               7422449 
##          RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz 
##                                               5837763 
##      RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz_Dam 
##                                               7289557 
##          RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz 
##                                               6340729 
##      RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz_Dam 
##                                               4475611
# Also combined the counts (Dam + lamina) and have them separate
genes_Tom2.count.combined<- genes_Tom2.count.lamina <- genes_Tom2.count.dam<- genes_Tom2.count
mcols(genes_Tom2.count.combined) <- (as(mcols(genes_Tom2.count)[, seq(1, 2*n, 2)], "data.frame")+
                                  as(mcols(genes_Tom2.count)[, seq(2, 2*n, 2)], "data.frame"))
mcols(genes_Tom2.count.lamina) <-mcols(genes_Tom2.count)[,seq(1, 2*n,2)]
mcols(genes_Tom2.count.dam) <-mcols(genes_Tom2.count)[,seq(2, 2*n,2)]

#Normalize to 1 M reads
norm_factor <- library_size/ 1e6
genes_Tom2.count.norm<- genes_Tom2.count
mcols(genes_Tom2.count.norm)<- t(t(as(mcols(genes_Tom2.count), "data.frame"))/ norm_factor)

#Also create a 1M mean counts of Dam and Lamina per experiment
genes_Tom2.count.norm.combined<- genes_Tom2.count.norm
mcols(genes_Tom2.count.norm.combined) <- (as(mcols(genes_Tom2.count.norm) [,seq(1, 2*n, 2)], "data.frame")+
                                      as(mcols(genes_Tom2.count.norm) [,seq(2, 2*n, 2)], "data.frame")) /2
# Normalize Lamina over Dam function (ratio2)
NormalizeDamID <- function(df, pseudo = 1) {
  # This function expects a data frame that is composed of lamina and Dam-only
  # columns, in that order. It simply determines the log2 ratio of the two for
  # every pair of columns. A pseudocount is added to prevent problems.
  n <- ncol(df)
  df.norm <- log2((df[, seq(1, n-1, 2)] + pseudo) / (df[, seq(2, n, 2)] + pseudo))
  df.norm
}

#Normalize LB over Dam (ratio2)
genes_Tom2.norm <- genes_Tom2.count
mcols(genes_Tom2.norm) <- NormalizeDamID(as(mcols(genes_Tom2.count.norm), "data.frame"))
mcols(genes_Tom2.norm) <- mcols(genes_Tom2.norm)[, samples_Tom2$sample]
# Plot the distributions for the various samples_Tom2 This does not work
#plot(1, 1, type = "n", xlim = c(-6, 4), ylim = c(0, 0.6),
#     xlab = "Dam-ratio (log2)", ylab = "density", main = "Density Dam ratio for genes_Tom2")

#for (i in 1:n) {
 # lines(density(mcols(genes_Tom2.norm)[, i],na.rm = TRUE,), col = samples_Tom2, lwd = 2,
  #      lty = as.numeric(samples_Tom2.df$integration[i]),)
#}

#legend("topright", legend = levels(samples_Tom2), 
    #   pch = 19, col = 1:length(levels(samples_Tom2)))
#Scale the data to the same mean and stdev
genes_Tom2.norm.scaled<-genes_Tom2.norm
mcols(genes_Tom2.norm.scaled)<- scale(as(mcols(genes_Tom2.norm.scaled), "data.frame"))
# Save files as RDS
saveRDS(samples_Tom2, file.path(output_dir, "samples_Tom2.rds"))
saveRDS(genes_Tom2.norm.scaled, file.path(output_dir, "genes_Tom2_norm_filtered.rds"))

# First, change the mcols of the genes_Tom2
mcols(genes_Tom2) <- mcols(genes_Tom2)[, c("gene_id", "gene_name", "GATC_fragments")]
saveRDS(genes_Tom2, file.path(output_dir, "genes_Tom2_filtered.rds"))
# samples_Tom2 data frame - basically metadata
padamid.samples_Tom2<- readRDS(file.path(output_dir,
                                    "samples_Tom2.rds"))

# Filtered genes_Tom2 used in the pA-DamID analysis
padamid.genes_Tom2 <- readRDS(file.path(output_dir, 
                                   "genes_Tom2_filtered.rds"))

# Data frame with per-gene information on differential analysis
#padamid.results <- readRDS(file.path(output_dir,
                                    #  "Differential_score.rds"))
padamid.norm <- readRDS(file.path(output_dir,
                                  "genes_Tom2_norm_filtered.rds"))

names(mcols(padamid.norm))<-padamid.samples_Tom2$samples_Tom2

class(padamid.norm)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#class(padamid.results)
class(padamid.genes_Tom2)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
class(padamid.samples_Tom2)
## [1] "data.frame"
padamid_norm_df<- as(mcols(padamid.norm), "data.frame") 
padamid_genes_Tom2_df<- as(mcols(padamid.genes_Tom2), "data.frame")
#binding gene name and danID score
damid_score_gene_Tom2<-cbind(padamid_genes_Tom2_df, padamid_norm_df)

damid_score_gene_Tom_LB1_LB2<-inner_join(damid_score_gene_Tom,damid_score_gene_Tom2, by="gene_name")

damid_score_gene_Tom_LB1_LB2<-damid_score_gene_Tom_LB1_LB2[,-(83:84)]
Taxol_damid_score_gene_Tom<-damid_score_gene_Tom_LB1_LB2 %>%
                        mutate(RPE_dPC3_TXR3_LB2=(`RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz`+                                   `RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
                        mutate(RPE_dPC3_TXR4_LB2=(`RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz`+                                   `RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
                        mutate(RPE_dPC3_WT_LB2=(`RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz`+                                        `RPE_dPC3_WT_LaminB2_R2-gatc.counts.txt.gz`)/2)%>% 
                        mutate(RPE_dPC3_WT_LB1=(`RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz`+                                        `RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz`)/2)%>% 
                        mutate(RPE_dPC3Mi_TXR_LB1=(`RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz`+                                   `RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz`)/2)%>%
                        mutate(RPE_iCut_5AZA_62_5_LB2=(`RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz`+                                   `RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz`)/2)%>% 
      mutate(RPE_iCut_ANCHOR3_clone7_DMSO_LB2=(`RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R1-gatc.counts.txt.gz`+                            `RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_ANCHOR3_clone7_TXR_LB2=(`RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R1-gatc.counts.txt.gz`+                            `RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_ANCHOR3_polyclonal_LB2=(`RPE_iCut_ANCHOR3_polyclonal_LaminB2_R1-gatc.counts.txt.gz`+                            `RPE_iCut_ANCHOR3_polyclonal_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_BIX_2_LB2=(`RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz`+                          
                                  `RPE_iCut_BIX_2_LaminB2_R3-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_DMSO_LB2=(`RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz`+                            
                                  `RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_GSK126_500_LB2=(`RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz`+                                                              `RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_LBRKO_LB2=(`RPE_iCut_LBRKO_LaminB2_R1-gatc.counts.txt.gz`+                                                              `RPE_iCut_LBRKO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_LMNAKO_LB2=(`RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz`+                                                              `RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_LMNB1KO_LB2=(`RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz`+                                                              `RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
      mutate(RPE_iCut_WT_LB2=(`RPE_iCut_WT_LaminB2_R3-gatc.counts.txt.gz`+                                                              `RPE_iCut_WT_LaminB2_R4-gatc.counts.txt.gz`)/2)

ABCB1<-Taxol_damid_score_gene_Tom%>%filter(gene_name=="ABCB1")
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_BIX_treatment.pdf")
Taxol_damid_score_gene_Tom%>% ggplot(., aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_BIX_2_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("Bix treatment,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_BIX_2_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_GSK_treatment.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_GSK126_500_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("GSK treatment,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_GSK126_500_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_AZA_treatment.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_5AZA_62_5_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("AZA treatment,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_5AZA_62_5_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_LBRKO.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LBRKO_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("LBRKO,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LBRKO_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_LMNAKO.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNAKO_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("LMNAKO,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNAKO_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_LMNB1KO.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNB1KO_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("LMNB1KO,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNB1KO_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()

Let’s bind old and new data

Taxol_damid_score_gene_merge<-inner_join(Taxol_damid_score_gene_Tom, Taxol_damid_score_gene, by="gene_name")

Let’s focus on RPE20 the first TXR selected clone First scatter lot according to the original control

ABCB1<-Taxol_damid_score_gene_merge%>%filter(gene_name=="ABCB1")

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_dPC3WTvsTXR_s20.pdf")
Taxol_damid_score_gene_merge %>% ggplot(., aes(x= RPE_dPC3_WT_LB1, y= RPE_dPC3Mi_TXR_LB1))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_s20 vs dPC3,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_dPC3_WT_LB1, y= RPE_dPC3Mi_TXR_LB1), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()

Now according to Anna the original parental control for TXR_s20 (or RPOE20) is RPE0. This dataset has LB2 data and not LB1, but let see how it looks.

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_RPE0vsTXR_s20.pdf")
Taxol_damid_score_gene_merge %>% ggplot(., aes(x= RPE0, y= RPE_dPC3Mi_TXR_LB1))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_s20 vs RPE0,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes( x= RPE0, y= RPE_dPC3Mi_TXR_LB1), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)

#dev.off()

Now let’s calculate the delta for all TXR vs relative control

Taxol_damid_score_gene_merge <-Taxol_damid_score_gene_merge%>%
              dplyr::mutate(Delta_TXRs20=RPE_dPC3Mi_TXR_LB1-RPE_dPC3_WT_LB1)%>%
              dplyr::mutate(Delta_TXR3=TXR3-DPURO_3)%>%
              dplyr::mutate(Delta_TXR4=TXR4-DPURO_3)%>%
              dplyr::mutate(Delta_TXR5=TXR5-RPE0)%>%
              dplyr::mutate(Delta_TXR6=TXR6-RPE0)%>%
              dplyr::mutate(Delta_ANCHOR_P_TXR=ANCHOR_P_TXR-ANCHOR_P)%>%
              dplyr::mutate(Delta_ANCHOR_c7_TXR=ANCHOR_c7_TXR_r1-ANCHOR_c7_r1)
ABCB1_delta<-Taxol_damid_score_gene_merge%>%filter(gene_name=="ABCB1")%>%select(gene_name,Delta_TXRs20,Delta_TXR3,Delta_TXR4, Delta_TXR5, Delta_TXR6, Delta_ANCHOR_P_TXR, Delta_ANCHOR_c7_TXR)



ABCB1_delta_gathered<-ABCB1_delta%>%gather(Delta_TXRs20,Delta_TXR3,Delta_TXR4, Delta_TXR5, Delta_TXR6, Delta_ANCHOR_P_TXR, Delta_ANCHOR_c7_TXR,key ="Clone", value = "Delta")
t.test(ABCB1_delta_gathered$Delta, alternative = "less")
## 
##  One Sample t-test
## 
## data:  ABCB1_delta_gathered$Delta
## t = -3.1993, df = 6, p-value = 0.009309
## alternative hypothesis: true mean is less than 0
## 95 percent confidence interval:
##        -Inf -0.3986421
## sample estimates:
## mean of x 
## -1.015347
p_val <- 0.009309
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Detachment_score_allclones.pdf")
ABCB1_delta%>%gather(Delta_TXRs20,Delta_TXR3,Delta_TXR4, Delta_TXR5, Delta_TXR6, Delta_ANCHOR_P_TXR, Delta_ANCHOR_c7_TXR,key ="Clone", value = "Delta")%>%ggplot(., aes(x=gene_name, y=Delta, color=Clone))+geom_beeswarm()+geom_hline(yintercept = 0,  linetype=2)+ylab("Detachment score")+theme_bw()+ ylim(-2.5,0.5)+ annotate("text",x=1,y=0.3,label=paste0('atop(bold("p = ',p_val,'"))'),cex=4,parse=TRUE)

#dev.off()
All_corr<-Taxol_damid_score_gene_merge%>%
        dplyr::select(`RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz`,`RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz`,`RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz`,`RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz`,
                      `RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_BIX_2_LaminB2_R2-gatc.counts.txt.gz`, 
                      `RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz`,
                      `RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz`,`RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz`,
                      `pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz`,
                      `ANCHOR_c7_r1`,`ANCHOR_c7_TXR_r1`,
                      `pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz`,              
                      `pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz`,                              `pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz`,              
                      `pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz`,           
                      `pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz`,                                        `pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz`,                                         `pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz`,`RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz`,`RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz`, `RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz`, `RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz`,`RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz`,    `RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz`,`RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz`,`RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz`,`RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz`,   `RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz`)%>%
  cor()

All_corr_df<-as.data.frame(All_corr)
write.csv2(All_corr_df,"/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Allcorrelation.csv", row.names = TRUE)

require(pheatmap)
## Loading required package: pheatmap
require(RColorBrewer)

#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Allcorrelation.pdf")
pheatmap(All_corr)

#dev.off()
#A <- matrix(rnorm(25, 0, 5), nrow = 5, ncol = 5)  
#print(A)
  
# Plot a heatmap 
#heatmap(A,Rowv=NA,Colv=NA,col=heat.colors(3))
  
# Plot a corresponding legend
#legend(x="right", legend=c("min", "med", "max"),fill=heat.colors(3))